knitr::opts_chunk$set(echo = TRUE, warning = FALSE, comment = FALSE, message = FALSE,
                      cache = FALSE)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(xgboost)
## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
library(Metrics)
library(ggpmisc)
## Loading required package: ggpp
## 
## Attaching package: 'ggpp'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(ggthemes)

match_dir = 'data/matchups/'
model_dir = 'data/models/'


# Set random seed
set.seed(799)

Purpose

The purpose of this script is to apply the xgboost algorithm to Remote Sensing Imagery of Lake Yojoa in Honduras, to estimate Yojoa water clarity. You can read more about this lake here.

In this very stringent approach, we subset the Secchi-Landsat matchups by Landsat flyover, resulting in no image-date crossover between the train, test, or validation datasets. We then create a weighted feature that awards the model for estimating higher Secchi values well disproportionate to lower Secchi values.

Load matchup data

We’ll load the environment variables from script 9 here to compare apples-to-apples.

load('data/models/train_test_val_verystringent.RData')

We want to predict the secchi value in these datasets, so let’s set the target as that variable:

## Identify our target (value is secchi)
target <- 'secchi'

xgboost Runs

Here, we develop models for 3 days, 5 days, and jemma-special (7 days except 1 day in oct/nov) matchup datasets.

Name feature groups

Here, we indicate the features to be used in our models. We’ll use the visual bands and add in summaries of ERA5 met data. In our datasets, the 5-day met summaries have the suffix ‘_5’, etc.

band_met3_feats <-  c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
                     'RN', 'BG', 'RB','GB',
                     'tot_sol_rad_KJpm2_3', 'max_temp_degK_3', 'mean_temp_degK_3', 'min_temp_degK_3',
                     'tot_precip_m_3', 'mean_wind_mps_3')

band_met5_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
                     'RN', 'BG', 'RB','GB',
                     'tot_sol_rad_KJpm2_5', 'max_temp_degK_5', 'mean_temp_degK_5', 'min_temp_degK_5',
                     'tot_precip_m_5', 'mean_wind_mps_5')

band_met51_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
                     'RN', 'BG', 'RB','GB',
                     'tot_sol_rad_KJpm2_5', 'max_temp_degK_5', 'mean_temp_degK_5', 'min_temp_degK_5',
                     'tot_precip_m_5', 'mean_wind_mps_5',
                     'solar_rad_KJpm2_prev', 'precip_m_prev','air_temp_degK_prev','wind_speed_mps_prev')

band_met71_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
                     'RN', 'BG', 'RB','GB',
                     'tot_sol_rad_KJpm2_7', 'max_temp_degK_7', 'mean_temp_degK_7', 'min_temp_degK_7',
                     'tot_precip_m_7', 'mean_wind_mps_7',
                     'solar_rad_KJpm2_prev', 'precip_m_prev','air_temp_degK_prev','wind_speed_mps_prev')

Format data for xgboost

3 Day Window

## 3 day window, 3 days previous met
dtrain_3d_3m <- xgb.DMatrix(data = as.matrix(train_3[,band_met3_feats]), 
                            label = train_3[,target],
                            weight = train_3$weight)

dtest_3d_3m <- xgb.DMatrix(data = as.matrix(test_3[,band_met3_feats]), 
                     label = test_3[,target],
                            weight = test_3$weight)

dval_3d_3m <- xgb.DMatrix(data = as.matrix(val_3[,band_met3_feats]), 
                     label = val_3[,target],
                            weight = val_3$weight)

## 5 day window, 5 days previous met
dtrain_3d_5m <- xgb.DMatrix(data = as.matrix(train_3[,band_met5_feats]), 
                            label = train_3[,target],
                            weight = train_3$weight)

dtest_3d_5m <- xgb.DMatrix(data = as.matrix(test_3[,band_met5_feats]), 
                     label = test_3[,target],
                            weight = test_3$weight)

dval_3d_5m <- xgb.DMatrix(data = as.matrix(val_3[,band_met5_feats]), 
                     label = val_3[,target],
                            weight = val_3$weight)

## 3 day window, 5/1 days previous met
dtrain_3d_51m <- xgb.DMatrix(data = as.matrix(train_3[,band_met51_feats]), 
                            label = train_3[,target])

dtest_3d_51m <- xgb.DMatrix(data = as.matrix(test_3[,band_met51_feats]), 
                     label = test_3[,target])

dval_3d_51m <- xgb.DMatrix(data = as.matrix(val_3[,band_met51_feats]), 
                     label = val_3[,target])

## 5 day window, 7/1 days previous met
dtrain_3d_71m <- xgb.DMatrix(data = as.matrix(train_3[,band_met71_feats]), 
                            label = train_3[,target],
                            weight = train_3$weight)

dtest_3d_71m <- xgb.DMatrix(data = as.matrix(test_3[,band_met71_feats]), 
                     label = test_3[,target],
                            weight = test_3$weight)

dval_3d_71m <- xgb.DMatrix(data = as.matrix(val_3[,band_met71_feats]), 
                     label = val_3[,target],
                            weight = val_3$weight)

5 Day Window

## 5 day window, 3 days previous met
dtrain_5d_3m <- xgb.DMatrix(data = as.matrix(train_5[,band_met3_feats]), 
                            label = train_5[,target],
                            weight = train_5$weight)

dtest_5d_3m <- xgb.DMatrix(data = as.matrix(test_5[,band_met3_feats]), 
                     label = test_5[,target],
                            weight = test_5$weight)

dval_5d_3m <- xgb.DMatrix(data = as.matrix(val_5[,band_met3_feats]), 
                     label = val_5[,target],
                            weight = val_5$weight)

## 5 day window, 5 days previous met
dtrain_5d_5m <- xgb.DMatrix(data = as.matrix(train_5[,band_met5_feats]), 
                            label = train_5[,target],
                            weight = train_5$weight)

dtest_5d_5m <- xgb.DMatrix(data = as.matrix(test_5[,band_met5_feats]), 
                     label = test_5[,target],
                            weight = test_5$weight)

dval_5d_5m <- xgb.DMatrix(data = as.matrix(val_5[,band_met5_feats]), 
                     label = val_5[,target],
                            weight = val_5$weight)

## 5 day window, 5/1 days previous met
dtrain_5d_51m <- xgb.DMatrix(data = as.matrix(train_5[,band_met51_feats]), 
                            label = train_5[,target])

dtest_5d_51m <- xgb.DMatrix(data = as.matrix(test_5[,band_met51_feats]), 
                     label = test_5[,target])

dval_5d_51m <- xgb.DMatrix(data = as.matrix(val_5[,band_met51_feats]), 
                     label = val_5[,target])

## 5 day window, 7/1 days previous met
dtrain_5d_71m <- xgb.DMatrix(data = as.matrix(train_5[,band_met71_feats]), 
                            label = train_5[,target],
                            weight = train_5$weight)

dtest_5d_71m <- xgb.DMatrix(data = as.matrix(test_5[,band_met71_feats]), 
                     label = test_5[,target],
                            weight = test_5$weight)

dval_5d_71m <- xgb.DMatrix(data = as.matrix(val_5[,band_met71_feats]), 
                     label = val_5[,target],
                            weight = val_5$weight)

Local Knowledge Window

## jemma window, 5 days previous met
dtrain_jd_5m <- xgb.DMatrix(data = as.matrix(train_j[,band_met5_feats]), 
                            label = train_j[,target],
                            weight = train_j$weight)

dtest_jd_5m <- xgb.DMatrix(data = as.matrix(test_j[,band_met5_feats]), 
                     label = test_j[,target],
                            weight = test_j$weight)

dval_jd_5m <- xgb.DMatrix(data = as.matrix(val_j[,band_met5_feats]), 
                     label = val_j[,target],
                            weight = val_j$weight)

## jemma window, 3 days previous met
dtrain_jd_3m <- xgb.DMatrix(data = as.matrix(train_j[,band_met3_feats]), 
                            label = train_j[,target],
                            weight = train_j$weight)

dtest_jd_3m <- xgb.DMatrix(data = as.matrix(test_j[,band_met3_feats]), 
                     label = test_j[,target],
                            weight = test_j$weight)

dval_jd_3m <- xgb.DMatrix(data = as.matrix(val_j[,band_met3_feats]), 
                     label = val_j[,target],
                            weight = val_j$weight)

## jemma window, 5/1 days previous met
dtrain_jd_51m <- xgb.DMatrix(data = as.matrix(train_j[,band_met51_feats]), 
                            label = train_j[,target])

dtest_jd_51m <- xgb.DMatrix(data = as.matrix(test_j[,band_met51_feats]), 
                     label = test_j[,target])

dval_jd_51m <- xgb.DMatrix(data = as.matrix(val_j[,band_met51_feats]), 
                     label = val_j[,target])

## 5 day window, 7/1 days previous met
dtrain_jd_71m <- xgb.DMatrix(data = as.matrix(train_j[,band_met71_feats]), 
                            label = train_j[,target],
                            weight = train_j$weight)

dtest_jd_71m <- xgb.DMatrix(data = as.matrix(test_j[,band_met71_feats]), 
                     label = test_j[,target],
                            weight = test_j$weight)

dval_jd_71m <- xgb.DMatrix(data = as.matrix(val_j[,band_met71_feats]), 
                     label = val_j[,target],
                            weight = val_j$weight)

Parameter optimization

This is an xgboost optimization method developed by Sam Sillen where you list many possible hyperparameter options and then create a matrix of all possible combinations and grab the top 20 performing combinations of hyperparameters by square error (our loss statistic).

# Hypertune xgboost parameters and save as 'best_params' 
grid_train <- expand.grid(
  max_depth= c(3,6,8),
  subsample = c(.5,.8,1),
  colsample_bytree= c(.5,.8,1),
  eta = c(0.1, 0.3),
  min_child_weight= c(3,5,7)
)

hypertune_xgboost = function(train,test, grid){
  params <- list(booster = "gbtree", objective = 'reg:squarederror', 
                 eta=grid$eta ,max_depth=grid$max_depth, 
                 min_child_weight=grid$min_child_weight,
                 subsample=grid$subsample, 
                 colsample_bytree=grid$colsample_bytree)
  xgb.naive <- xgb.train(params = params, data = train, nrounds = 1000, 
                         watchlist = list(train = train, val = test), 
                         verbose = 0,
                         early_stopping_rounds = 20)
  summary <- grid %>% mutate(val_loss = xgb.naive$best_score, best_message = xgb.naive$best_msg,
                             mod = list(xgb.naive))
  
  return(summary) 
}

3 day window, 3 day met hypertuning

Note, evaluation is turned off for these chunks as to not overwrite previous models and parameter tuning in next section.

## Hypertune xgboost 3 day window, 3 day met
xgboost_hypertune_3d_3m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_3d_3m,dtest_3d_3m,current)
  })

mod_summary_3d_3m <- xgboost_hypertune_3d_3m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_3d_3m <- xgboost_hypertune_3d_3m[xgboost_hypertune_3d_3m$val_loss==min(xgboost_hypertune_3d_3m$val_loss),]

save(mod_summary_3d_3m,best_mod_3d_3m, file = 'data/models/paramsxg_ep_ds_val_3d_3m.RData')

3 day window, 5 day met hypertuning

## Hypertune xgboost 3 day window, 5 day met
xgboost_hypertune_3d_5m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_3d_5m,dtest_3d_5m,current)
  })

mod_summary_3d_5m <- xgboost_hypertune_3d_5m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_3d_5m <- xgboost_hypertune_3d_5m[xgboost_hypertune_3d_5m$val_loss==min(xgboost_hypertune_3d_5m$val_loss),]

save(mod_summary_3d_5m,best_mod_3d_5m, file = 'data/models/paramsxg_ep_ds_val_3d_5m.RData')

3 day window, 5/1 day met hypertuning

## Hypertune xgboost 5 day window, 5/1 day met
xgboost_hypertune_3d_51m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_3d_51m,dtest_3d_51m,current)
  })

mod_summary_3d_51m <- xgboost_hypertune_3d_51m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_3d_51m <- xgboost_hypertune_3d_51m[xgboost_hypertune_3d_51m$val_loss==min(xgboost_hypertune_3d_51m$val_loss),]

save(mod_summary_3d_51m,best_mod_3d_51m, file = 'data/models/paramsxg_ep_ds_val_3d_51m.RData')

3 day window, 7/1 day met hypertuning

## Hypertune xgboost 5 day window, 7/1 day met
xgboost_hypertune_3d_71m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_3d_71m,dtest_3d_71m,current)
  })

mod_summary_3d_71m <- xgboost_hypertune_3d_71m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_3d_71m <- xgboost_hypertune_3d_71m[xgboost_hypertune_3d_71m$val_loss==min(xgboost_hypertune_3d_71m$val_loss),]

save(mod_summary_3d_71m,best_mod_3d_71m, file = 'data/models/paramsxg_ep_ds_val_3d_71m.RData')

5 day window, 3 day met hypertuning

## Hypertune xgboost 5 day window, 3 day met
xgboost_hypertune_5d_3m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_5d_3m,dtest_5d_3m,current)
  })

mod_summary_5d_3m <- xgboost_hypertune_5d_3m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_5d_3m <- xgboost_hypertune_5d_3m[xgboost_hypertune_5d_3m$val_loss==min(xgboost_hypertune_5d_3m$val_loss),]

save(mod_summary_5d_3m,best_mod_5d_3m, file = 'data/models/paramsxg_ep_ds_val_5d_3m.RData')

5 day window, 5 day met hypertuning

## Hypertune xgboost 5 day window, 5 day met
xgboost_hypertune_5d_5m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_5d_5m,dtest_5d_5m,current)
  })

mod_summary_5d_5m <- xgboost_hypertune_5d_5m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_5d_5m <- xgboost_hypertune_5d_5m[xgboost_hypertune_5d_5m$val_loss==min(xgboost_hypertune_5d_5m$val_loss),]

save(mod_summary_5d_5m,best_mod_5d_5m, file = 'data/models/paramsxg_ep_ds_val_5d_5m.RData')

5 day window, 5/1 day met hypertuning

## Hypertune xgboost 5 day window, 5/1 day met
xgboost_hypertune_5d_51m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_5d_51m,dtest_5d_51m,current)
  })

mod_summary_5d_51m <- xgboost_hypertune_5d_51m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_5d_51m <- xgboost_hypertune_5d_51m[xgboost_hypertune_5d_51m$val_loss==min(xgboost_hypertune_5d_51m$val_loss),]

save(mod_summary_5d_51m,best_mod_5d_51m, file = 'data/models/paramsxg_ep_ds_val_5d_51m.RData')

5 day window, 7/1 day met hypertuning

## Hypertune xgboost 5 day window, 7/1 day met
xgboost_hypertune_5d_71m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_5d_71m,dtest_5d_71m,current)
  })

mod_summary_5d_71m <- xgboost_hypertune_5d_71m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_5d_71m <- xgboost_hypertune_5d_71m[xgboost_hypertune_5d_71m$val_loss==min(xgboost_hypertune_5d_71m$val_loss),]

save(mod_summary_5d_71m,best_mod_5d_71m, file = 'data/models/paramsxg_ep_ds_val_5d_71m.RData')

Jemma-special, 3 day met

## Hypertune xgboost jemma window, 3 day met
xgboost_hypertune_jd_3m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_jd_3m,dtest_jd_3m,current)
  })

mod_summary_jd_3m <- xgboost_hypertune_jd_3m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_jd_3m <- xgboost_hypertune_jd_3m[xgboost_hypertune_jd_3m$val_loss==min(xgboost_hypertune_jd_3m$val_loss),]

save(mod_summary_jd_3m,best_mod_jd_3m, file = 'data/models/paramsxg_ep_ds_val_jd_3m.RData')

Jemma-special, 5 day met

## Hypertune xgboost jemma window, 5 day met
xgboost_hypertune_jd_5m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_jd_5m,dtest_jd_5m,current)
  })

mod_summary_jd_5m <- xgboost_hypertune_jd_5m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_jd_5m <- xgboost_hypertune_jd_5m[xgboost_hypertune_jd_5m$val_loss==min(xgboost_hypertune_jd_5m$val_loss),]

save(mod_summary_jd_5m,best_mod_jd_5m, file = 'data/models/paramsxg_ep_ds_val_jd_5m.RData')

Jemma-special, 5/1 day met

## Hypertune xgboost jemma window, 5/1 day met
xgboost_hypertune_jd_51m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_jd_51m,dtest_jd_51m,current)
  })

mod_summary_jd_51m <- xgboost_hypertune_jd_51m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_jd_51m <- xgboost_hypertune_jd_51m[xgboost_hypertune_jd_51m$val_loss==min(xgboost_hypertune_jd_51m$val_loss),]

save(mod_summary_jd_51m,best_mod_jd_51m, file = 'data/models/paramsxg_ep_ds_val_jd_51m.RData')

Jemma-special, 7/1 day met

## Hypertune xgboost jemma window, 5 day met
xgboost_hypertune_jd_71m <- grid_train %>%
  pmap_dfr(function(...) {
    current <- tibble(...)
    hypertune_xgboost(dtrain_jd_71m,dtest_jd_71m,current)
  })

mod_summary_jd_71m <- xgboost_hypertune_jd_71m %>% 
  arrange(val_loss) %>%
  dplyr::slice(1:20)

best_mod_jd_71m <- xgboost_hypertune_jd_71m[xgboost_hypertune_jd_71m$val_loss==min(xgboost_hypertune_jd_71m$val_loss),]

save(mod_summary_jd_71m,best_mod_jd_71m, file = 'data/models/paramsxg_ep_ds_val_jd_71m.RData')

Model Assessment and Application

load model summaries

load('data/models/paramsxg_ep_ds_val_3d_5m.RData')
load('data/models/paramsxg_ep_ds_val_3d_3m.RData')
load('data/models/paramsxg_ep_ds_val_3d_51m.RData')
load('data/models/paramsxg_ep_ds_val_3d_71m.RData')
load('data/models/paramsxg_ep_ds_val_5d_5m.RData')
load('data/models/paramsxg_ep_ds_val_5d_3m.RData')
load('data/models/paramsxg_ep_ds_val_5d_51m.RData')
load('data/models/paramsxg_ep_ds_val_5d_71m.RData')
load('data/models/paramsxg_ep_ds_val_jd_5m.RData')
load('data/models/paramsxg_ep_ds_val_jd_3m.RData')
load('data/models/paramsxg_ep_ds_val_jd_51m.RData')
load('data/models/paramsxg_ep_ds_val_jd_71m.RData')

Now that these are loaded, we need to look at the test/train statistics. Ideally the train/test RMSE are relatively close so we don’t choose too overfit of a model. Below, we apply the model to the validation dataset and plot the validation observed versus predicted.

The optimized booster was chosen as the highest-ranked booster where the train/test were within 0.15m RMSE. If no booster met that, the booster with the closest train/test RMSE was chosen.

Three day window dataset

mod_summary_3d_3m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                   
FALSE    <chr>                                          
FALSE  1 "[10]\ttrain-rmse:0.477587\tval-rmse:1.061289" 
FALSE  2 "[21]\ttrain-rmse:0.194911\tval-rmse:1.107435" 
FALSE  3 "[19]\ttrain-rmse:0.268642\tval-rmse:1.122342" 
FALSE  4 "[134]\ttrain-rmse:0.000295\tval-rmse:1.134900"
FALSE  5 "[334]\ttrain-rmse:0.000665\tval-rmse:1.142065"
FALSE  6 "[11]\ttrain-rmse:0.382840\tval-rmse:1.144331" 
FALSE  7 "[10]\ttrain-rmse:0.343416\tval-rmse:1.153642" 
FALSE  8 "[10]\ttrain-rmse:0.455634\tval-rmse:1.153768" 
FALSE  9 "[75]\ttrain-rmse:0.000653\tval-rmse:1.154036" 
FALSE 10 "[15]\ttrain-rmse:0.410208\tval-rmse:1.155701" 
FALSE 11 "[46]\ttrain-rmse:0.205879\tval-rmse:1.158334" 
FALSE 12 "[32]\ttrain-rmse:0.159866\tval-rmse:1.159084" 
FALSE 13 "[76]\ttrain-rmse:0.014075\tval-rmse:1.159845" 
FALSE 14 "[26]\ttrain-rmse:0.518273\tval-rmse:1.161326" 
FALSE 15 "[21]\ttrain-rmse:0.027639\tval-rmse:1.164237" 
FALSE 16 "[13]\ttrain-rmse:0.128857\tval-rmse:1.166551" 
FALSE 17 "[107]\ttrain-rmse:0.071307\tval-rmse:1.168489"
FALSE 18 "[49]\ttrain-rmse:0.330814\tval-rmse:1.168865" 
FALSE 19 "[291]\ttrain-rmse:0.000891\tval-rmse:1.168997"
FALSE 20 "[34]\ttrain-rmse:0.025951\tval-rmse:1.171079"
mod_summary_3d_5m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                   
FALSE    <chr>                                          
FALSE  1 "[8]\ttrain-rmse:0.515746\tval-rmse:1.195269"  
FALSE  2 "[19]\ttrain-rmse:0.020525\tval-rmse:1.195724" 
FALSE  3 "[19]\ttrain-rmse:0.194365\tval-rmse:1.199597" 
FALSE  4 "[9]\ttrain-rmse:0.455980\tval-rmse:1.232620"  
FALSE  5 "[10]\ttrain-rmse:0.372991\tval-rmse:1.243468" 
FALSE  6 "[98]\ttrain-rmse:0.000394\tval-rmse:1.249451" 
FALSE  7 "[142]\ttrain-rmse:0.005179\tval-rmse:1.253825"
FALSE  8 "[20]\ttrain-rmse:0.086589\tval-rmse:1.254019" 
FALSE  9 "[29]\ttrain-rmse:0.482150\tval-rmse:1.255012" 
FALSE 10 "[12]\ttrain-rmse:0.464967\tval-rmse:1.264834" 
FALSE 11 "[16]\ttrain-rmse:0.372073\tval-rmse:1.267269" 
FALSE 12 "[89]\ttrain-rmse:0.039772\tval-rmse:1.278646" 
FALSE 13 "[44]\ttrain-rmse:0.009665\tval-rmse:1.282161" 
FALSE 14 "[36]\ttrain-rmse:0.403999\tval-rmse:1.283408" 
FALSE 15 "[38]\ttrain-rmse:0.245275\tval-rmse:1.290493" 
FALSE 16 "[8]\ttrain-rmse:0.430535\tval-rmse:1.293000"  
FALSE 17 "[99]\ttrain-rmse:0.008901\tval-rmse:1.294168" 
FALSE 18 "[20]\ttrain-rmse:0.096681\tval-rmse:1.299647" 
FALSE 19 "[11]\ttrain-rmse:0.455698\tval-rmse:1.301085" 
FALSE 20 "[9]\ttrain-rmse:0.392661\tval-rmse:1.301779"
mod_summary_3d_51m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                   
FALSE    <chr>                                          
FALSE  1 "[13]\ttrain-rmse:0.400027\tval-rmse:1.099498" 
FALSE  2 "[112]\ttrain-rmse:0.001016\tval-rmse:1.101700"
FALSE  3 "[57]\ttrain-rmse:0.034285\tval-rmse:1.112129" 
FALSE  4 "[52]\ttrain-rmse:0.205842\tval-rmse:1.118093" 
FALSE  5 "[9]\ttrain-rmse:0.463851\tval-rmse:1.120178"  
FALSE  6 "[9]\ttrain-rmse:0.438791\tval-rmse:1.122075"  
FALSE  7 "[8]\ttrain-rmse:0.510232\tval-rmse:1.122775"  
FALSE  8 "[8]\ttrain-rmse:0.559591\tval-rmse:1.124419"  
FALSE  9 "[8]\ttrain-rmse:0.429651\tval-rmse:1.130044"  
FALSE 10 "[9]\ttrain-rmse:0.366573\tval-rmse:1.131433"  
FALSE 11 "[35]\ttrain-rmse:0.258252\tval-rmse:1.131860" 
FALSE 12 "[31]\ttrain-rmse:0.343238\tval-rmse:1.132070" 
FALSE 13 "[11]\ttrain-rmse:0.282591\tval-rmse:1.135783" 
FALSE 14 "[9]\ttrain-rmse:0.505418\tval-rmse:1.136712"  
FALSE 15 "[72]\ttrain-rmse:0.113127\tval-rmse:1.139506" 
FALSE 16 "[12]\ttrain-rmse:0.453466\tval-rmse:1.143642" 
FALSE 17 "[37]\ttrain-rmse:0.181998\tval-rmse:1.145793" 
FALSE 18 "[43]\ttrain-rmse:0.218057\tval-rmse:1.148687" 
FALSE 19 "[33]\ttrain-rmse:0.266122\tval-rmse:1.149218" 
FALSE 20 "[9]\ttrain-rmse:0.427015\tval-rmse:1.149506"
mod_summary_3d_71m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                   
FALSE    <chr>                                          
FALSE  1 "[36]\ttrain-rmse:0.062589\tval-rmse:0.975806" 
FALSE  2 "[49]\ttrain-rmse:0.014366\tval-rmse:1.028137" 
FALSE  3 "[31]\ttrain-rmse:0.226747\tval-rmse:1.053164" 
FALSE  4 "[24]\ttrain-rmse:0.041203\tval-rmse:1.055632" 
FALSE  5 "[296]\ttrain-rmse:0.000632\tval-rmse:1.056838"
FALSE  6 "[54]\ttrain-rmse:0.051681\tval-rmse:1.063167" 
FALSE  7 "[23]\ttrain-rmse:0.170265\tval-rmse:1.070594" 
FALSE  8 "[281]\ttrain-rmse:0.000511\tval-rmse:1.091091"
FALSE  9 "[222]\ttrain-rmse:0.004552\tval-rmse:1.095487"
FALSE 10 "[43]\ttrain-rmse:0.288927\tval-rmse:1.101155" 
FALSE 11 "[140]\ttrain-rmse:0.011462\tval-rmse:1.104318"
FALSE 12 "[10]\ttrain-rmse:0.179988\tval-rmse:1.112360" 
FALSE 13 "[226]\ttrain-rmse:0.006312\tval-rmse:1.121170"
FALSE 14 "[47]\ttrain-rmse:0.114646\tval-rmse:1.122887" 
FALSE 15 "[9]\ttrain-rmse:0.417078\tval-rmse:1.125053"  
FALSE 16 "[278]\ttrain-rmse:0.000586\tval-rmse:1.126514"
FALSE 17 "[9]\ttrain-rmse:0.373570\tval-rmse:1.130506"  
FALSE 18 "[15]\ttrain-rmse:0.108924\tval-rmse:1.132513" 
FALSE 19 "[17]\ttrain-rmse:0.086053\tval-rmse:1.132769" 
FALSE 20 "[150]\ttrain-rmse:0.005810\tval-rmse:1.133881"
# most of best models are overfit, so looking for train/test RMSE that are closer
optimized_booster_3d_3m <- mod_summary_3d_3m$mod[1][[1]]
optimized_booster_3d_5m <- mod_summary_3d_5m$mod[1][[1]]
optimized_booster_3d_51m <- mod_summary_3d_51m$mod[8][[1]]
optimized_booster_3d_71m <- mod_summary_3d_71m$mod[15][[1]]
#note that pretty much all of these are overfit.

# Apply best mod
preds_3 <- val_3 %>% 
  mutate(pred_secchi_3d_5m = predict(optimized_booster_3d_5m, dval_3d_5m),
         pred_secchi_3d_3m = predict(optimized_booster_3d_3m, dval_3d_3m),
         pred_secchi_3d_51m = predict(optimized_booster_3d_51m, dval_3d_51m),
         pred_secchi_3d_71m = predict(optimized_booster_3d_71m, dval_3d_71m))

evals_3 <- preds_3 %>%
  summarise(across(c(pred_secchi_3d_3m, pred_secchi_3d_5m, pred_secchi_3d_71m, pred_secchi_3d_51m),
                   list(rmse = ~rmse(secchi, .),
                        mae = ~mae(secchi, .),
                        mape = ~mape(secchi, .),
                        bias = ~bias(secchi, .),
                        p.bias = ~percent_bias(secchi, .),
                        smape = ~smape(secchi, .),
                        r2 = ~cor(secchi, .)^2), 
                   .names = "{fn}_{col}"))

evals_3
FALSE   rmse_pred_secchi_3d_3m mae_pred_secchi_3d_3m mape_pred_secchi_3d_3m
FALSE 1               1.204984             0.8107306              0.2454355
FALSE   bias_pred_secchi_3d_3m p.bias_pred_secchi_3d_3m smape_pred_secchi_3d_3m
FALSE 1              0.2311832              -0.01734229               0.2423958
FALSE   r2_pred_secchi_3d_3m rmse_pred_secchi_3d_5m mae_pred_secchi_3d_5m
FALSE 1            0.2835168               1.146213             0.7579876
FALSE   mape_pred_secchi_3d_5m bias_pred_secchi_3d_5m p.bias_pred_secchi_3d_5m
FALSE 1              0.2353272              0.1463488              -0.03809979
FALSE   smape_pred_secchi_3d_5m r2_pred_secchi_3d_5m rmse_pred_secchi_3d_71m
FALSE 1                0.223505            0.3381305                1.206261
FALSE   mae_pred_secchi_3d_71m mape_pred_secchi_3d_71m bias_pred_secchi_3d_71m
FALSE 1               0.818684               0.2518711               0.3330314
FALSE   p.bias_pred_secchi_3d_71m smape_pred_secchi_3d_71m r2_pred_secchi_3d_71m
FALSE 1                  0.021072                0.2564194             0.3157539
FALSE   rmse_pred_secchi_3d_51m mae_pred_secchi_3d_51m mape_pred_secchi_3d_51m
FALSE 1                1.231972              0.8388924               0.2444665
FALSE   bias_pred_secchi_3d_51m p.bias_pred_secchi_3d_51m smape_pred_secchi_3d_51m
FALSE 1               0.4422491                0.04573491                0.2520182
FALSE   r2_pred_secchi_3d_51m
FALSE 1             0.3413099

Five day window dataset

mod_summary_5d_3m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                  
FALSE    <chr>                                         
FALSE  1 "[22]\ttrain-rmse:0.041155\tval-rmse:1.474127"
FALSE  2 "[15]\ttrain-rmse:0.455179\tval-rmse:1.479925"
FALSE  3 "[17]\ttrain-rmse:0.125483\tval-rmse:1.513831"
FALSE  4 "[13]\ttrain-rmse:0.472533\tval-rmse:1.534639"
FALSE  5 "[9]\ttrain-rmse:0.511468\tval-rmse:1.548024" 
FALSE  6 "[31]\ttrain-rmse:0.515173\tval-rmse:1.567495"
FALSE  7 "[77]\ttrain-rmse:0.113023\tval-rmse:1.576125"
FALSE  8 "[16]\ttrain-rmse:0.422095\tval-rmse:1.590905"
FALSE  9 "[12]\ttrain-rmse:0.172194\tval-rmse:1.598149"
FALSE 10 "[16]\ttrain-rmse:0.038247\tval-rmse:1.601600"
FALSE 11 "[16]\ttrain-rmse:0.286907\tval-rmse:1.602675"
FALSE 12 "[14]\ttrain-rmse:0.476075\tval-rmse:1.603160"
FALSE 13 "[36]\ttrain-rmse:0.453115\tval-rmse:1.603734"
FALSE 14 "[7]\ttrain-rmse:0.648907\tval-rmse:1.608385" 
FALSE 15 "[16]\ttrain-rmse:0.371636\tval-rmse:1.611723"
FALSE 16 "[79]\ttrain-rmse:0.011788\tval-rmse:1.613913"
FALSE 17 "[44]\ttrain-rmse:0.258767\tval-rmse:1.614569"
FALSE 18 "[66]\ttrain-rmse:0.015719\tval-rmse:1.616120"
FALSE 19 "[22]\ttrain-rmse:0.049156\tval-rmse:1.617053"
FALSE 20 "[39]\ttrain-rmse:0.278643\tval-rmse:1.623845"
mod_summary_5d_5m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                  
FALSE    <chr>                                         
FALSE  1 "[14]\ttrain-rmse:0.288309\tval-rmse:1.284465"
FALSE  2 "[13]\ttrain-rmse:0.113564\tval-rmse:1.338978"
FALSE  3 "[38]\ttrain-rmse:0.002466\tval-rmse:1.348758"
FALSE  4 "[22]\ttrain-rmse:0.052921\tval-rmse:1.375349"
FALSE  5 "[27]\ttrain-rmse:0.048802\tval-rmse:1.379506"
FALSE  6 "[82]\ttrain-rmse:0.007752\tval-rmse:1.382302"
FALSE  7 "[17]\ttrain-rmse:0.092878\tval-rmse:1.391614"
FALSE  8 "[13]\ttrain-rmse:0.369755\tval-rmse:1.406061"
FALSE  9 "[67]\ttrain-rmse:0.048342\tval-rmse:1.410505"
FALSE 10 "[17]\ttrain-rmse:0.335241\tval-rmse:1.412208"
FALSE 11 "[21]\ttrain-rmse:0.030524\tval-rmse:1.417177"
FALSE 12 "[81]\ttrain-rmse:0.013327\tval-rmse:1.417764"
FALSE 13 "[52]\ttrain-rmse:0.047915\tval-rmse:1.425982"
FALSE 14 "[69]\ttrain-rmse:0.023813\tval-rmse:1.430881"
FALSE 15 "[16]\ttrain-rmse:0.261122\tval-rmse:1.432051"
FALSE 16 "[17]\ttrain-rmse:0.418284\tval-rmse:1.435247"
FALSE 17 "[10]\ttrain-rmse:0.357821\tval-rmse:1.435502"
FALSE 18 "[57]\ttrain-rmse:0.053306\tval-rmse:1.443999"
FALSE 19 "[13]\ttrain-rmse:0.165044\tval-rmse:1.444481"
FALSE 20 "[73]\ttrain-rmse:0.033954\tval-rmse:1.447306"
mod_summary_5d_51m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                   
FALSE    <chr>                                          
FALSE  1 "[46]\ttrain-rmse:0.218048\tval-rmse:1.013079" 
FALSE  2 "[88]\ttrain-rmse:0.064346\tval-rmse:1.017276" 
FALSE  3 "[36]\ttrain-rmse:0.088009\tval-rmse:1.034381" 
FALSE  4 "[17]\ttrain-rmse:0.314965\tval-rmse:1.044706" 
FALSE  5 "[17]\ttrain-rmse:0.171126\tval-rmse:1.065627" 
FALSE  6 "[69]\ttrain-rmse:0.059558\tval-rmse:1.085671" 
FALSE  7 "[60]\ttrain-rmse:0.306046\tval-rmse:1.086301" 
FALSE  8 "[6]\ttrain-rmse:0.728980\tval-rmse:1.093971"  
FALSE  9 "[72]\ttrain-rmse:0.149041\tval-rmse:1.096494" 
FALSE 10 "[145]\ttrain-rmse:0.216789\tval-rmse:1.098977"
FALSE 11 "[45]\ttrain-rmse:0.084980\tval-rmse:1.100158" 
FALSE 12 "[32]\ttrain-rmse:0.405424\tval-rmse:1.102863" 
FALSE 13 "[32]\ttrain-rmse:0.181356\tval-rmse:1.103312" 
FALSE 14 "[12]\ttrain-rmse:0.202233\tval-rmse:1.104636" 
FALSE 15 "[18]\ttrain-rmse:0.225423\tval-rmse:1.106117" 
FALSE 16 "[9]\ttrain-rmse:0.471254\tval-rmse:1.106718"  
FALSE 17 "[75]\ttrain-rmse:0.332806\tval-rmse:1.107193" 
FALSE 18 "[89]\ttrain-rmse:0.014922\tval-rmse:1.107607" 
FALSE 19 "[79]\ttrain-rmse:0.112648\tval-rmse:1.108478" 
FALSE 20 "[22]\ttrain-rmse:0.294097\tval-rmse:1.113070"
mod_summary_5d_71m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                   
FALSE    <chr>                                          
FALSE  1 "[30]\ttrain-rmse:0.015034\tval-rmse:1.419342" 
FALSE  2 "[53]\ttrain-rmse:0.100595\tval-rmse:1.439041" 
FALSE  3 "[5]\ttrain-rmse:0.804013\tval-rmse:1.455937"  
FALSE  4 "[75]\ttrain-rmse:0.127392\tval-rmse:1.500577" 
FALSE  5 "[70]\ttrain-rmse:0.183777\tval-rmse:1.508244" 
FALSE  6 "[87]\ttrain-rmse:0.024247\tval-rmse:1.511966" 
FALSE  7 "[99]\ttrain-rmse:0.227587\tval-rmse:1.523988" 
FALSE  8 "[10]\ttrain-rmse:0.356757\tval-rmse:1.524829" 
FALSE  9 "[71]\ttrain-rmse:0.023370\tval-rmse:1.533075" 
FALSE 10 "[17]\ttrain-rmse:0.069744\tval-rmse:1.539453" 
FALSE 11 "[47]\ttrain-rmse:0.200225\tval-rmse:1.540581" 
FALSE 12 "[41]\ttrain-rmse:0.009065\tval-rmse:1.540768" 
FALSE 13 "[64]\ttrain-rmse:0.249842\tval-rmse:1.544079" 
FALSE 14 "[72]\ttrain-rmse:0.055702\tval-rmse:1.544483" 
FALSE 15 "[14]\ttrain-rmse:0.359285\tval-rmse:1.546591" 
FALSE 16 "[65]\ttrain-rmse:0.118521\tval-rmse:1.546699" 
FALSE 17 "[67]\ttrain-rmse:0.265781\tval-rmse:1.553425" 
FALSE 18 "[80]\ttrain-rmse:0.267681\tval-rmse:1.556340" 
FALSE 19 "[175]\ttrain-rmse:0.017735\tval-rmse:1.557727"
FALSE 20 "[98]\ttrain-rmse:0.185597\tval-rmse:1.560225"
# most of best models are overfit, so looking for train/test RMSE that are closer
optimized_booster_5d_3m <- mod_summary_5d_3m$mod[14][[1]]
optimized_booster_5d_5m <- mod_summary_5d_5m$mod[16][[1]]
optimized_booster_5d_51m <- mod_summary_5d_51m$mod[8][[1]]
optimized_booster_5d_71m <- mod_summary_5d_71m$mod[3][[1]]

# Apply best mod
preds_5 <- val_5 %>% 
  mutate(pred_secchi_5d_5m = predict(optimized_booster_5d_5m, dval_5d_5m),
         pred_secchi_5d_3m = predict(optimized_booster_5d_3m, dval_5d_3m),
         pred_secchi_5d_51m = predict(optimized_booster_5d_51m, dval_5d_51m),
         pred_secchi_5d_71m = predict(optimized_booster_5d_71m, dval_5d_71m))

evals_5 <- preds_5 %>%
  summarise(across(c(pred_secchi_5d_3m, pred_secchi_5d_5m, pred_secchi_5d_71m,pred_secchi_5d_51m),
                   list(rmse = ~rmse(secchi, .),
                        mae = ~mae(secchi, .),
                        mape = ~mape(secchi, .),
                        bias = ~bias(secchi, .),
                        p.bias = ~percent_bias(secchi, .),
                        smape = ~smape(secchi, .),
                        r2 = ~cor(secchi, .)^2), 
                   .names = "{fn}_{col}"))

evals_5
FALSE   rmse_pred_secchi_5d_3m mae_pred_secchi_5d_3m mape_pred_secchi_5d_3m
FALSE 1              0.8687499             0.6876114               0.270092
FALSE   bias_pred_secchi_5d_3m p.bias_pred_secchi_5d_3m smape_pred_secchi_5d_3m
FALSE 1              -0.176083               -0.1463158               0.2338374
FALSE   r2_pred_secchi_5d_3m rmse_pred_secchi_5d_5m mae_pred_secchi_5d_5m
FALSE 1            0.5357969              0.9447434             0.8009287
FALSE   mape_pred_secchi_5d_5m bias_pred_secchi_5d_5m p.bias_pred_secchi_5d_5m
FALSE 1              0.3374968             -0.3855903               -0.2477381
FALSE   smape_pred_secchi_5d_5m r2_pred_secchi_5d_5m rmse_pred_secchi_5d_71m
FALSE 1               0.2792514            0.5273364                1.048708
FALSE   mae_pred_secchi_5d_71m mape_pred_secchi_5d_71m bias_pred_secchi_5d_71m
FALSE 1              0.8092424               0.2920258               0.1303768
FALSE   p.bias_pred_secchi_5d_71m smape_pred_secchi_5d_71m r2_pred_secchi_5d_71m
FALSE 1               -0.07309027                0.2722098             0.3023413
FALSE   rmse_pred_secchi_5d_51m mae_pred_secchi_5d_51m mape_pred_secchi_5d_51m
FALSE 1               0.9065162              0.7264437               0.2688716
FALSE   bias_pred_secchi_5d_51m p.bias_pred_secchi_5d_51m smape_pred_secchi_5d_51m
FALSE 1             0.006378763                -0.1013392                0.2526804
FALSE   r2_pred_secchi_5d_51m
FALSE 1             0.4804024

Jemma-special window dataset

mod_summary_jd_3m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                  
FALSE    <chr>                                         
FALSE  1 "[16]\ttrain-rmse:0.079467\tval-rmse:0.714188"
FALSE  2 "[59]\ttrain-rmse:0.014189\tval-rmse:0.714414"
FALSE  3 "[14]\ttrain-rmse:0.111040\tval-rmse:0.716807"
FALSE  4 "[12]\ttrain-rmse:0.358001\tval-rmse:0.727829"
FALSE  5 "[38]\ttrain-rmse:0.107724\tval-rmse:0.741570"
FALSE  6 "[36]\ttrain-rmse:0.179660\tval-rmse:0.752897"
FALSE  7 "[16]\ttrain-rmse:0.335474\tval-rmse:0.756676"
FALSE  8 "[45]\ttrain-rmse:0.329021\tval-rmse:0.766702"
FALSE  9 "[46]\ttrain-rmse:0.340604\tval-rmse:0.768102"
FALSE 10 "[9]\ttrain-rmse:0.462185\tval-rmse:0.769035" 
FALSE 11 "[36]\ttrain-rmse:0.148926\tval-rmse:0.772016"
FALSE 12 "[47]\ttrain-rmse:0.324067\tval-rmse:0.776433"
FALSE 13 "[12]\ttrain-rmse:0.411067\tval-rmse:0.780482"
FALSE 14 "[41]\ttrain-rmse:0.365920\tval-rmse:0.786743"
FALSE 15 "[13]\ttrain-rmse:0.061643\tval-rmse:0.786832"
FALSE 16 "[9]\ttrain-rmse:0.463423\tval-rmse:0.788203" 
FALSE 17 "[6]\ttrain-rmse:0.738932\tval-rmse:0.794452" 
FALSE 18 "[62]\ttrain-rmse:0.269829\tval-rmse:0.795906"
FALSE 19 "[13]\ttrain-rmse:0.112359\tval-rmse:0.797685"
FALSE 20 "[37]\ttrain-rmse:0.144154\tval-rmse:0.802158"
mod_summary_jd_5m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                  
FALSE    <chr>                                         
FALSE  1 "[34]\ttrain-rmse:0.177041\tval-rmse:0.748675"
FALSE  2 "[28]\ttrain-rmse:0.309177\tval-rmse:0.763568"
FALSE  3 "[18]\ttrain-rmse:0.051808\tval-rmse:0.775033"
FALSE  4 "[41]\ttrain-rmse:0.136158\tval-rmse:0.775581"
FALSE  5 "[12]\ttrain-rmse:0.100908\tval-rmse:0.787784"
FALSE  6 "[49]\ttrain-rmse:0.097162\tval-rmse:0.794204"
FALSE  7 "[24]\ttrain-rmse:0.413657\tval-rmse:0.795801"
FALSE  8 "[33]\ttrain-rmse:0.173532\tval-rmse:0.797295"
FALSE  9 "[56]\ttrain-rmse:0.035296\tval-rmse:0.804461"
FALSE 10 "[41]\ttrain-rmse:0.074995\tval-rmse:0.808864"
FALSE 11 "[24]\ttrain-rmse:0.245552\tval-rmse:0.810609"
FALSE 12 "[33]\ttrain-rmse:0.308492\tval-rmse:0.814132"
FALSE 13 "[39]\ttrain-rmse:0.120043\tval-rmse:0.818682"
FALSE 14 "[13]\ttrain-rmse:0.134302\tval-rmse:0.818960"
FALSE 15 "[19]\ttrain-rmse:0.279768\tval-rmse:0.820244"
FALSE 16 "[13]\ttrain-rmse:0.264919\tval-rmse:0.820439"
FALSE 17 "[7]\ttrain-rmse:0.418207\tval-rmse:0.824284" 
FALSE 18 "[8]\ttrain-rmse:0.526393\tval-rmse:0.825755" 
FALSE 19 "[45]\ttrain-rmse:0.055825\tval-rmse:0.827922"
FALSE 20 "[23]\ttrain-rmse:0.004523\tval-rmse:0.831956"
mod_summary_jd_51m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                  
FALSE    <chr>                                         
FALSE  1 "[11]\ttrain-rmse:0.540552\tval-rmse:0.677212"
FALSE  2 "[6]\ttrain-rmse:0.774070\tval-rmse:0.692034" 
FALSE  3 "[6]\ttrain-rmse:0.678798\tval-rmse:0.732886" 
FALSE  4 "[5]\ttrain-rmse:0.804580\tval-rmse:0.735036" 
FALSE  5 "[12]\ttrain-rmse:0.369406\tval-rmse:0.737677"
FALSE  6 "[25]\ttrain-rmse:0.403520\tval-rmse:0.738237"
FALSE  7 "[9]\ttrain-rmse:0.458656\tval-rmse:0.740189" 
FALSE  8 "[12]\ttrain-rmse:0.183252\tval-rmse:0.740674"
FALSE  9 "[24]\ttrain-rmse:0.471257\tval-rmse:0.750969"
FALSE 10 "[26]\ttrain-rmse:0.526289\tval-rmse:0.755764"
FALSE 11 "[9]\ttrain-rmse:0.313900\tval-rmse:0.756729" 
FALSE 12 "[28]\ttrain-rmse:0.339554\tval-rmse:0.759583"
FALSE 13 "[9]\ttrain-rmse:0.410149\tval-rmse:0.763569" 
FALSE 14 "[10]\ttrain-rmse:0.330390\tval-rmse:0.766693"
FALSE 15 "[32]\ttrain-rmse:0.283537\tval-rmse:0.774035"
FALSE 16 "[7]\ttrain-rmse:0.723469\tval-rmse:0.774735" 
FALSE 17 "[12]\ttrain-rmse:0.247526\tval-rmse:0.775796"
FALSE 18 "[26]\ttrain-rmse:0.536531\tval-rmse:0.776272"
FALSE 19 "[6]\ttrain-rmse:0.735207\tval-rmse:0.776751" 
FALSE 20 "[42]\ttrain-rmse:0.275810\tval-rmse:0.778111"
mod_summary_jd_71m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE    best_message                                  
FALSE    <chr>                                         
FALSE  1 "[18]\ttrain-rmse:0.103390\tval-rmse:0.737208"
FALSE  2 "[12]\ttrain-rmse:0.306107\tval-rmse:0.788309"
FALSE  3 "[8]\ttrain-rmse:0.499671\tval-rmse:0.797015" 
FALSE  4 "[19]\ttrain-rmse:0.062418\tval-rmse:0.800453"
FALSE  5 "[17]\ttrain-rmse:0.055128\tval-rmse:0.802865"
FALSE  6 "[50]\ttrain-rmse:0.287471\tval-rmse:0.806220"
FALSE  7 "[24]\ttrain-rmse:0.160493\tval-rmse:0.807828"
FALSE  8 "[19]\ttrain-rmse:0.296031\tval-rmse:0.812437"
FALSE  9 "[17]\ttrain-rmse:0.397541\tval-rmse:0.812698"
FALSE 10 "[43]\ttrain-rmse:0.353648\tval-rmse:0.819210"
FALSE 11 "[38]\ttrain-rmse:0.439128\tval-rmse:0.820094"
FALSE 12 "[61]\ttrain-rmse:0.032097\tval-rmse:0.820747"
FALSE 13 "[36]\ttrain-rmse:0.390634\tval-rmse:0.820827"
FALSE 14 "[31]\ttrain-rmse:0.341589\tval-rmse:0.824642"
FALSE 15 "[8]\ttrain-rmse:0.354879\tval-rmse:0.825006" 
FALSE 16 "[41]\ttrain-rmse:0.097777\tval-rmse:0.825346"
FALSE 17 "[56]\ttrain-rmse:0.170000\tval-rmse:0.831017"
FALSE 18 "[7]\ttrain-rmse:0.564719\tval-rmse:0.832583" 
FALSE 19 "[29]\ttrain-rmse:0.273936\tval-rmse:0.835093"
FALSE 20 "[27]\ttrain-rmse:0.314804\tval-rmse:0.835844"
optimized_booster_jd_3m <- mod_summary_jd_3m$mod[17][[1]]
optimized_booster_jd_5m <- mod_summary_jd_5m$mod[18][[1]]
optimized_booster_jd_51m <- mod_summary_jd_51m$mod[3][[1]]
optimized_booster_jd_71m <- mod_summary_jd_71m$mod[18][[1]]

# Apply best mod
preds_jd <- val_j %>% 
  mutate(pred_secchi_jd_5m = predict(optimized_booster_jd_5m, dval_jd_5m),
         pred_secchi_jd_3m = predict(optimized_booster_jd_3m, dval_jd_3m),
         pred_secchi_jd_51m = predict(optimized_booster_jd_51m, dval_jd_51m),
         pred_secchi_jd_71m = predict(optimized_booster_jd_71m, dval_jd_71m))

evals_jd <- preds_jd %>%
  summarise(across(c(pred_secchi_jd_3m, pred_secchi_jd_5m, pred_secchi_jd_71m, pred_secchi_jd_51m),
                   list(rmse = ~rmse(secchi, .),
                        mae = ~mae(secchi, .),
                        mape = ~mape(secchi, .),
                        bias = ~bias(secchi, .),
                        p.bias = ~percent_bias(secchi, .),
                        smape = ~smape(secchi, .),
                        r2 = ~cor(secchi, .)^2), 
                   .names = "{fn}_{col}"))

evals_jd
FALSE   rmse_pred_secchi_jd_3m mae_pred_secchi_jd_3m mape_pred_secchi_jd_3m
FALSE 1               1.295863             0.8484592              0.2952314
FALSE   bias_pred_secchi_jd_3m p.bias_pred_secchi_jd_3m smape_pred_secchi_jd_3m
FALSE 1             0.07228615              -0.09536324               0.2602842
FALSE   r2_pred_secchi_jd_3m rmse_pred_secchi_jd_5m mae_pred_secchi_jd_5m
FALSE 1           0.03203755                1.15185             0.7514447
FALSE   mape_pred_secchi_jd_5m bias_pred_secchi_jd_5m p.bias_pred_secchi_jd_5m
FALSE 1              0.2695587            -0.01317536               -0.1127845
FALSE   smape_pred_secchi_jd_5m r2_pred_secchi_jd_5m rmse_pred_secchi_jd_71m
FALSE 1               0.2344183            0.2136125                1.283404
FALSE   mae_pred_secchi_jd_71m mape_pred_secchi_jd_71m bias_pred_secchi_jd_71m
FALSE 1              0.7946067               0.2551866               0.1577832
FALSE   p.bias_pred_secchi_jd_71m smape_pred_secchi_jd_71m r2_pred_secchi_jd_71m
FALSE 1               -0.05186786                0.2450416            0.04438589
FALSE   rmse_pred_secchi_jd_51m mae_pred_secchi_jd_51m mape_pred_secchi_jd_51m
FALSE 1                1.147571              0.7108967               0.2166104
FALSE   bias_pred_secchi_jd_51m p.bias_pred_secchi_jd_51m smape_pred_secchi_jd_51m
FALSE 1               0.1029316               -0.05425721                0.2148729
FALSE   r2_pred_secchi_jd_51m
FALSE 1             0.2179037

Model Performance - 3 day window

Keep in mind that all of these models seemed overfit in the train/test sets.

Model Performance - 5 day window

ggplot(preds_5, aes(x = secchi, y = pred_secchi_5d_3m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\nfive day matchups, band and 3-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

ggplot(preds_5, aes(x = secchi, y = pred_secchi_5d_5m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\nfive day matchups, band and 5-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

ggplot(preds_5, aes(x = secchi, y = pred_secchi_5d_51m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\nfive day matchups, band and 5/1-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

ggplot(preds_5, aes(x = secchi, y = pred_secchi_5d_71m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\nfive day matchups, band and 7/1-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

Model Performance - Jemma window

ggplot(preds_jd, aes(x = secchi, y = pred_secchi_jd_3m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\n7/1 day matchups, band and 3-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

ggplot(preds_jd, aes(x = secchi, y = pred_secchi_jd_5m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\n7/1 day matchups, band and 5-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

ggplot(preds_jd, aes(x = secchi, y = pred_secchi_jd_51m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\n7/1 day matchups, band and 5/1-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

ggplot(preds_jd, aes(x = secchi, y = pred_secchi_jd_71m)) + 
  geom_point() +
  geom_abline(color = 'grey', lty = 2) + 
  coord_cartesian(xlim = c(0, 6.5),
                  ylim = c(0,6.5)) +
  stat_poly_eq(aes(label = paste(after_stat(adj.rr.label))),
               formula = y~x, 
               parse = TRUE, 
               label.y = Inf, 
               vjust = 1.3) +
  labs(title = 'Yojoa Secchi xgboost Validation Data\n7/1 day matchups, band and 7/1-day met summaries', 
       subtitle = 'high data stringency\nweighted Secchi\ngrey dashed line is 1:1', 
       x = 'Actual Secchi (m)', 
       y = 'Predicted Secchi (m)')  +
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))

These all look terrible.

Applying model to full data

Local knowledge window does not perform well in this scenario. Best performing seems to be 5-day matchup and 3-day met summary.

features = band_met3_feats
model = optimized_booster_5d_3m
met = '3 day met summaries'
window = '5 day window'

save(optimized_booster_5d_3m, file = 'data/models/optimized_xg_10_5d_3m.RData')

full_stack <- read_csv('data/upstreamRS/yojoa_corr_rrs_met_scaled_v2023-06-15.csv') %>%
  mutate(secchi = 100) %>% 
    mutate(secchi = as.numeric(secchi)) %>% #there's one wonky value in here with two decimal points... dropping from this analysis
    filter(!is.na(secchi))

stack_xgb <- xgb.DMatrix(data = as.matrix(full_stack[,features]))

full_stack_simp <- full_stack %>%
  mutate(secchi = predict(model, stack_xgb)) %>%
  select(date, location, secchi, mission) 

situ_stack <- read_csv('data/in-situ/Secchi_completedataset.csv') %>%
  mutate(secchi = as.numeric(secchi),
         date = mdy(date)) %>%
  filter(!is.na(secchi)) %>%
  mutate(mission = 'Measured') %>%
  bind_rows(full_stack_simp)%>% 
  mutate(location = gsub(' ', '', location))

Let’s look at each of the site records alongside the Landsat-estimated Secchi depth.

FALSE [[1]]

FALSE 
FALSE [[2]]

FALSE 
FALSE [[3]]

FALSE 
FALSE [[4]]

FALSE 
FALSE [[5]]

FALSE 
FALSE [[6]]

FALSE 
FALSE [[7]]

FALSE 
FALSE [[8]]

FALSE 
FALSE [[9]]

FALSE 
FALSE [[10]]

FALSE 
FALSE [[11]]

FALSE 
FALSE [[12]]

FALSE 
FALSE [[13]]

FALSE 
FALSE [[14]]

FALSE 
FALSE [[15]]

FALSE 
FALSE [[16]]

FALSE 
FALSE [[17]]

FALSE 
FALSE [[18]]

Look at recent data per location

plotRecentBySite = function(site) {
  ggplot(situ_stack %>%
           filter(location == site), aes(x = date, y = secchi, color = mission,
                                        shape = mission)) + 
    geom_point() + 
    labs(title = paste0('Yojoa Secchi 2018-2022 - Site ', site),
         subtitle = paste0(window, ', ', met, '\nhigh data stringency, weighted Secchi'),
         y = 'Secchi (m)',
         color = 'data source', shape = 'data source') +
    scale_color_manual(values = c('grey10','grey30','grey50','grey70','blue')) + 
    theme_few() +
    theme(legend.position = c(0.8,0.8)) + 
    scale_shape_manual(values = c(19,19,19,19,1)) +
    scale_y_continuous(limits = c(0, max(situ_stack$secchi)), breaks = seq(0, max(situ_stack$secchi), 2)) +
    scale_x_date(limits = c(ymd('2018-01-01'), max(situ_stack$date))) +
    theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
          plot.subtitle = element_text(hjust = 0.5))
}

map(sort(unique(situ_stack$location)), plotRecentBySite)
FALSE [[1]]

FALSE 
FALSE [[2]]

FALSE 
FALSE [[3]]

FALSE 
FALSE [[4]]

FALSE 
FALSE [[5]]

FALSE 
FALSE [[6]]

FALSE 
FALSE [[7]]

FALSE 
FALSE [[8]]

FALSE 
FALSE [[9]]

FALSE 
FALSE [[10]]

FALSE 
FALSE [[11]]

FALSE 
FALSE [[12]]

FALSE 
FALSE [[13]]

FALSE 
FALSE [[14]]

FALSE 
FALSE [[15]]

FALSE 
FALSE [[16]]

FALSE 
FALSE [[17]]

FALSE 
FALSE [[18]]

Whole lake secchi dynamics

While there is plenty of variability across the lake, let’s summarize to a single value per date, since not all sites have the same density of record. Since there are a few oddballs in here (both in terms of measured and estimated), we’ll use the median Secchi across all sites.

lake_med <- situ_stack %>%
  group_by(date,mission) %>%
  summarize(across(where(is.numeric),median))

2006

Recent

ggplot(lake_med, aes(x = date, y = secchi, color = mission, shape = mission)) + 
  geom_point() + 
  scale_color_manual(values = c('grey10','grey30','grey50','grey70','blue')) + 
  theme_few() +
  labs(title = 'Yojoa Secchi 2018-2022\nwhole-lake median',
         subtitle = paste0(window, ', ', met, '\nhigh data stringency, weighted Secchi'),
       y = 'median Secchi (m)',
       color = 'data source', shape = 'data source') +
  theme(legend.position = c(0.8,0.8)) + 
  scale_shape_manual(values = c(19,19,19,19,1)) +
  scale_x_date(limits = c(as.Date('2018-01-01'), as.Date('2023-01-01')))+
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        plot.subtitle = element_text(hjust = 0.5))